G588 Accompanying R Markdown Document

This is an R Markdown document to accompany a final project paper for G588: Applied Spatial Statistics instructed by Dr. Scott Robeson.

Exploring the Microsoft Building Footprint Dataset

In 2018, Microsoft released a computer-generated building footprints GIS layer for the US as open data. This document demonstrates some of the methods used in R to explore this dataset. For additional context, refer to the accompanying paper.

A brief look at the largest building in Indiana. The largest “building” identified through the algorithm appears to be a series of buildings of a rail loading area of US Steelworks in Gary:

setwd(‘D:/G588’) Largest building in Indiana?. The second largest building in Indiana according to the dataset is the Toyota plant near Princeton.

It is 328924.2449 square meters which is over 81 acres: Largest building in Indiana?.

# Read the county-level layer from the current working directory.
# NOTE(review): readOGR() is presumably rgdal's and unfactor() presumably
# varhandle's; neither library() call is visible in this part of the file.
shape <- readOGR(
  layer = "footprints_plus_census2000",
  dsn = "."
)
## OGR data source with driver: ESRI Shapefile 
## Source: "D:\G588", layer: "footprints_plus_census2000"
## with 92 features
## It has 29 fields
## Integer64 fields read as strings:  NCAPC_1 CNTY_FIPS STFID POP2000 AGE_65_UP HOUSEHOLDS FAMILIES HSE_UNITS VACANT OWNER_OCC RENTER_OCC FREQUENCY COUNT_OBJE
# Attribute table of the layer.
dat <- slot(shape, "data")

# Copy of the attribute table passed through unfactor(); the fields reported
# above as "read as strings" are used as numbers via `ufdat` further down.
ufdat <- unfactor(dat)
#bxp<-boxplot(dat$SUM_SHAPE_, horizontal=TRUE, axes=FALSE)
#mtext(c("Min","Max"),  at=bxp$stats[c(1,5)], line=-3)
#plot(dat[20:29])
#plot(dat$HOUSEHOLDS,dat$FREQUENCY)
#par(mfrow=c(2,3),pty='s') 
# Distribution summary of the HOUSEHOLDS field across counties.
summary(ufdat$HOUSEHOLDS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2201    8028   12642   25395   25149  352164
# Density-scaled histogram of FREQUENCY. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
# NOTE(review): FREQUENCY is presumably the number of footprints per county;
# confirm against the join that produced the layer.
hist(ufdat$FREQUENCY, probability = TRUE)

# Ratio of 2000 population to FREQUENCY for every county.
br <- ufdat$POP2000 / ufdat$FREQUENCY
# The ratio is written out again here (rather than passing `br`) because hist()
# derives its default title and x label from the argument expression.
hist((ufdat$POP2000) / (ufdat$FREQUENCY))

# Scatter of the ratio against population, each point tagged with NAME_U.
plot(br, ufdat$POP2000)
text(br, ufdat$POP2000, labels = ufdat$NAME_U)

# Persons per building for each county: 2000 population divided by FREQUENCY.
# NOTE(review): assumes FREQUENCY is the building count per county; confirm.
sums <- ufdat$POP2000 / ufdat$FREQUENCY

# Count-scaled histogram; the returned object is kept for its break points.
sums_hist <- hist(
  sums,
  breaks = 20,
  main = "Persons per building Indiana Counties",
  xlab = "Persons per building",
  ylab = "# Counties"
)

# density() integrates to 1 while the histogram's y axis is in counts, so the
# curve is multiplied by (number of counties * bin width) to share the scale.
# A single number for `breaks` yields equally spaced bins, so the first width
# is representative of all of them.
sums_density <- density(sums)
lines(
  sums_density$x,
  sums_density$y * length(sums) * diff(sums_hist$breaks)[1],
  col = "red"
)

# sums is persons per building (population / FREQUENCY), not buildings per
# person


# Persons per building against county population, with default axes.
plot(sums, ufdat$POP2000)
text(sums, ufdat$POP2000, labels = dat$NAME_U)

# Same scatter with titles and fixed axis ranges. The x values are
# population / FREQUENCY, i.e. persons per building, so the axis is labelled
# that way ("buildings per capita" would be the inverse ratio).
plot(
  sums, ufdat$POP2000,
  main = "Persons per building", sub = "Indiana Counties",
  xlab = "Persons per building", ylab = "population",
  xlim = c(1, 2.8), ylim = c(1, 1000000)
)
text(sums, ufdat$POP2000, labels = dat$NAME_U)

# Print the per-county values.
sums
##  [1] 1.304095 1.219358 2.075708 2.282572 2.322655 2.071341 1.883506 1.571852
##  [9] 1.576822 1.602102 1.541634 1.412146 1.359920 2.120671 1.564495 1.362454
## [17] 1.367218 1.249432 1.733095 1.920458 1.554589 1.564681 1.523683 1.243700
## [25] 1.666260 1.270361 1.240923 2.105893 1.288162 1.560542 1.909604 2.475158
## [33] 1.203574 1.620228 1.355881 1.951736 1.813630 1.434484 1.326411 1.676621
## [41] 1.597224 1.471891 1.594151 1.525841 1.807245 1.530307 1.349272 2.615680
## [49] 1.601529 1.573913 1.644896 1.213759 1.069101 1.544107 1.872039 1.668309
## [57] 2.258765 1.437558 1.195929 1.358653 1.277177 1.763000 2.374877 1.166875
## [65] 1.176082 1.645986 1.628435 1.440657 1.352409 1.476145 1.195366 1.571111
## [73] 1.010028 1.531717 1.904522 1.258221 1.317662 1.502519 1.241869 1.359961
## [81] 1.762335 1.422855 1.659433 1.407878 1.158275 1.211613 2.017060 1.441571
## [89] 1.787877 1.511366 1.275314 2.142918
# Histogram of county population in 2000. POP2000 is one of the fields the
# reader reported as "read as strings", so it is passed through unfactor()
# before plotting.
hist(unfactor(dat$POP2000))

# Population against FREQUENCY, each point labelled with NAME_U.
plot(unfactor(dat$POP2000),unfactor(dat$FREQUENCY))
text(unfactor(dat$POP2000),unfactor(dat$FREQUENCY),labels=dat$NAME_U)

# Normal Q-Q plot of average household size, with its reference line.
qqnorm(dat$AVE_HH_SZ, pch = 19, main = "Q-Q Plot of HH SIZE")
qqline(dat$AVE_HH_SZ)

# Normal Q-Q plot of mean building size, with its reference line.
qqnorm(dat$MEAN_SHAPE, pch = 19, main = "Q-Q Plot of Building Size")
qqline(dat$MEAN_SHAPE)

#why is mean building size so different
# Histogram of mean building size per county.
# Nothing else is layered on this figure: passing the whole `ufdat` table to
# lines() only coerces it with data.matrix() (emitting "NAs introduced by
# coercion" warnings) and draws unrelated columns, and a qqline() belongs on
# the Q-Q plot of MEAN_SHAPE, not on histogram axes.
hist(dat$MEAN_SHAPE)

# Distribution summary of mean building size across counties.
summary(dat$MEAN_SHAPE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   168.4   218.1   239.9   238.0   254.5   326.3
# Histogram of average household size; the default title and axis label come
# from the argument expression.
hist(dat$AVE_HH_SZ)

# County area against summed building area, default labels, points tagged
# with NAME_U.
# NOTE(review): a commented-out line elsewhere in this file spells the column
# SUM_SHAPE_; if that is the real field name, `$` is resolving SUM_SHAPE by
# partial matching. Confirm against names(dat).
plot(dat$AREA, dat$SUM_SHAPE)
text(dat$AREA, dat$SUM_SHAPE, labels = dat$NAME_U)

# The same scatter again, this time with explicit titles and axis labels.
plot(
  dat$AREA, dat$SUM_SHAPE,
  xlab = "Total Area of County", ylab = "Total Area with Buildings",
  main = "Total area and Total area with buildings", sub = "Indiana Counties"
)
text(dat$AREA, dat$SUM_SHAPE, labels = dat$NAME_U)

# Square meters under roof divided by population: the author reports a range
# of roughly 100 to 220 square meters under roof per person.
# Surprisingly, Marion and Lake counties sit near the bottom at 100 m2 or
# less, while less populous areas appear to have more footprint per capita.
# Density and height (number of stories) are not accounted for in footprints.
plot(dat$SUM_SHAPE / ufdat$POP2000, ufdat$POP2000)
text(dat$SUM_SHAPE / ufdat$POP2000, ufdat$POP2000, labels = dat$NAME_U)

# Percentage of each county's area covered by buildings, against population.
# SUM_SHAPE / AREA is a fraction (its summary elsewhere in this file tops out
# below 0.09), so it is multiplied by 100 to agree with the "Percentage"
# wording of the title and axis label.
# NOTE(review): assumes SUM_SHAPE and AREA are in the same units; confirm.
plot(
  100 * ufdat$SUM_SHAPE / ufdat$AREA, ufdat$POP2000,
  xlab = "Percentage of total area of a county covered by buildings",
  ylab = "pop",
  main = "Percentage of total area of a county covered by buildings"
)
text(100 * ufdat$SUM_SHAPE / ufdat$AREA, ufdat$POP2000, labels = dat$NAME_U)

# Distribution summary of the built-up share of each county (SUM_SHAPE / AREA).
summary(ufdat$SUM_SHAPE / ufdat$AREA)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001599 0.003711 0.005092 0.008528 0.008308 0.088437
# Footprint area per person against population, with titles and axis labels.
plot(
  dat$SUM_SHAPE / ufdat$POP2000, ufdat$POP2000,
  xlab = "Square meters of building footprints per capita",
  ylab = "Population",
  main = "Square meters of building footprints per capita",
  sub = "Indiana Counties"
)
text(dat$SUM_SHAPE / ufdat$POP2000, ufdat$POP2000, labels = dat$NAME_U)

# Built-up share of each county (SUM_SHAPE / AREA) against the county's AREA,
# each point labelled with NAME_U. Axis labels default to the argument
# expressions.
plot(ufdat$SUM_SHAPE/ufdat$AREA, ufdat$AREA)
text(ufdat$SUM_SHAPE/ufdat$AREA, ufdat$AREA, labels=ufdat$NAME_U)

# Share of each county covered by buildings. The values are fractions of the
# county area (multiply by 100 to read them as percentages).
summary(ufdat$SUM_SHAPE/ufdat$AREA)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001599 0.003711 0.005092 0.008528 0.008308 0.088437
# Print the per-county fractions.
ufdat$SUM_SHAPE/ufdat$AREA
##  [1] 0.007050724 0.007425515 0.023720053 0.023545142 0.030260151 0.013174022
##  [7] 0.008944985 0.007338194 0.006620017 0.006196689 0.008408619 0.004145028
## [13] 0.006409588 0.025016424 0.003652034 0.002476523 0.003525295 0.002268585
## [19] 0.004931510 0.005180272 0.005433080 0.006874246 0.004958680 0.003432774
## [25] 0.005225933 0.001750595 0.003838459 0.008273882 0.004401003 0.004726856
## [31] 0.014670236 0.013212550 0.001599072 0.004639839 0.004351759 0.013802850
## [37] 0.014078712 0.002452151 0.004072386 0.031472408 0.004572002 0.008168108
## [43] 0.003312692 0.006795101 0.009194889 0.011490726 0.002235702 0.088436832
## [49] 0.019167459 0.004163502 0.006285973 0.003271435 0.003088848 0.006317266
## [55] 0.019873244 0.008018453 0.011222922 0.004016710 0.003730747 0.002802569
## [61] 0.005003585 0.009842538 0.011927105 0.002632572 0.004358224 0.007810856
## [67] 0.002460359 0.004012151 0.003376408 0.005212962 0.004053974 0.005214051
## [73] 0.002997272 0.005226974 0.003834771 0.002517078 0.004997292 0.006384830
## [79] 0.003402068 0.002961370 0.013766366 0.002083394 0.003893220 0.007720234
## [85] 0.001981555 0.004521076 0.021397327 0.002820532 0.007182648 0.004048591
## [91] 0.003789111 0.033423764
# Footprint area per person against population, labelled with NAME_U.
plot(dat$SUM_SHAPE / ufdat$POP2000, ufdat$POP2000)
text(dat$SUM_SHAPE / ufdat$POP2000, ufdat$POP2000, labels = dat$NAME_U)

# Average household size against mean building size. The labels get their own
# plot: placed on the figure above, their coordinates would be read against
# unrelated axes.
plot(dat$AVE_HH_SZ, dat$MEAN_SHAPE)
text(dat$AVE_HH_SZ, dat$MEAN_SHAPE, labels = dat$NAME_U)

# Keep only the numeric columns of the attribute table. vapply() guarantees a
# logical mask whatever the input, and drop = FALSE keeps the result a data
# frame even if a single column qualifies.
# Fields the reader imported as strings are not numeric and are left out.
is_numeric_col <- vapply(shape@data, is.numeric, logical(1))
dat.numeric <- shape@data[, is_numeric_col, drop = FALSE]